* Set up log
* Move to the project root stored in the global macro ohie. The path is
* quoted so that directories whose names contain spaces are handled.
cd "$ohie"

* Close any log that is still open; capture suppresses the error that is
* raised when no log is open
cap log close

* Date stamp in YYMMDD form, used in the name of the log file
global sysdate: disp %tdYYNNDD  date("`c(current_date)'", "DMY")
qui log using 	"./logs/ohie_data_setup_$sysdate.log", replace

* Set up timer
* Timer 1 measures the run time of this do-file; it is read when the
* do-file finishes
timer clear 1
timer on 1 

/*----------------------------------------------------------------------*/
/* PROGRAM: ohie_data_setup.do						*/
/*									*/
/* PURPOSE:								*/
/* [*]	This code creates the analytic data set for the OHIE data	*/
/*	for the 1 lottery entrant sample, which is the main working	*/
/*	sample in the paper. The majority of the subsequent do-files	*/
/* 	use this as the input data set.				 	*/
/*									*/
/* NOTES:								*/
/* [*] 	Some general information about the randomization and lottery, 	*/
/*	pg. 2 from the OHIE user guide: "In 2008, Oregon selected 	*/
/*	roughly 30,000 individuals by lottery from a waiting list of	*/
/*	about 90,000 for an otherwise closed Medicaid program. The state*/
/*	conducted eight lottery	drawings from March through September	*/
/*	2008. Selected individuals won the opportunity - for themselves */
/*	and any household member - to apply for health insurance 	*/
/*	benefits through a Medicaid program called Oregon Health Plan 	*/
/*	Standard (OHP Standard). OHP Standard provides 	benefits to 	*/
/*	low-income adults who are not categorically eligible for	*/
/*	Oregon's traditional Medicaid program (OHP Plus); to be eligible*/
/*	individuals must be adults ages 19-64, not otherwise eligible 	*/
/*	for Medicaid or other public insurance, Oregon residents, U.S. 	*/
/*	citizens or legal immigrants, have been without health 		*/
/*	insurance for six months, have income below the	federal poverty */
/*	level, and have assets below $2,000. The randomly selected 	*/
/*	individuals chosen by the lottery who completed the application	*/
/*	process and met the eligibility criteria were enrolled in OHP 	*/
/*	Standard."							*/
/* [*]	Even though most of the .do files and the results of the paper 	*/
/*	are based on the 1 lottery entrant sample, which is defined as 	*/
/*	our baseline sample, we also create the analytic data set for  	*/
/*	the full sample that can be used for replicating the Taubman 	*/
/*	2014 results.							*/
/*									*/
/* OUTPUT:								*/
/* [*]	oregonnumhh1.dta: This is the analytic data set for the OHIE 1 	*/
/*	lottery sample and will be used throughout all .do files as an	*/
/*	input data set.							*/
/* [*]	oregonpooled.dta: This is the analytic data set for the full 	*/
/*	OHIE sample.							*/
/*									*/
/*----------------------------------------------------------------------*/
	
* Session settings: start with no data in memory, switch off the more
* prompt (permanently), and create new variables as doubles by default
clear
set more off, permanently
set type double

*********************************************
* SPECIFY INPUT FILES		*
*********************************************

* Every raw file is read from the folder named by the global macro dofiles

* Raw data: descriptive variables
local data_descr "$dofiles/oregonhie_descriptive_vars.dta"

* Raw data: ER utilization
local data_ed "$dofiles/oregonhie_ed_vars.dta"

* Raw data: state programs
local data_stprog "$dofiles/oregonhie_stateprograms_vars.dta"

* Raw data: 0m survey
local data_surv0m "$dofiles/oregonhie_survey0m_vars.dta"
	
*********************************************
* MACROS AND SWITCHES			*
*********************************************
	
* Outcome variables
local Ys "Y_any Y_num Y_charges"

* Endogenous variable
local D "any_medicaid"

* Instrument
local Z "Z"

* Pre-period measures, assembled group by group
local Y_pre "Y_any_pre Y_num_pre"
local Y_pre "`Y_pre' Y_num_pre0 Y_num_pre1 Y_num_pre2 Y_num_pre3 Y_num_pre4"
local Y_pre "`Y_pre' Y_num_pre1_3 Y_num_pre2_3"
local Y_pre "`Y_pre' Y_pre_bi1 Y_pre_bi2 Y_pre_bi3 Y_pre_bi4"
local Y_pre "`Y_pre' Y_charges_pre"

* Common covariates, followed by the same list split into its categorical
* and continuous members
local X "age female english"
local X_byte "female english"
local X_cont "age"

* Frequency weight variable
* This variable is a special case; the section of this do-file that creates
* w explains why it exists.
local wt "w"

* Cluster variable
local cluster "household_id"

* Full list of variables to keep in the analytic data sets
local varlist "person_id `Ys' `D' `Z' `Y_pre' `X' `wt' `cluster'"
local varlist "`varlist' numhh first_day_list snap_ever_prenotify07 tanf_ever_prenotify07"
local varlist "`varlist' applied_app approved_app selfhealth*"

*********************************************
* OREGON DATA			*
*********************************************
	
* Build the person-level file from the raw data, matching on person_id.
* The ER and state-program files are merged onto the descriptive file and
* only individuals found on both sides of each merge are retained.
use "`data_descr'", clear
foreach src in data_ed data_stprog {
	merge 1:1 person_id using "``src''", keep(match) nogenerate
}

* The 0m survey is attached last; individuals without a survey record stay
* in the data and survey-only records are discarded
merge 1:1 person_id using "`data_surv0m'", keep(master match) nogenerate
		
* Outcomes (Y), instrument (Z), and endogenous variable (D)

* Outcomes
* Every outcome is measured over the study period, which runs from March 10,
* 2008 to September 30, 2009.

* Y_any: binary indicator of any ER visit in the study period, taken from
* any_visit_ed in the OHIE raw data. It has no missing values.
generate Y_any = any_visit_ed

* Y_num: number of ER visits in the study period, taken from
* num_visit_cens_ed in the OHIE raw data. The public access files censor
* this variable. The OHIE documentation states: "This variable was truncated
* at 2*99th percentile of the original distribution (conditional on being
* non-zero). The public use variable was additionally censored to ensure
* de-identification (see User Guide for explanation)." The User Guide adds:
* "Most continuous variables (e.g., that capture the total number of ED
* visits in a given period) have been censored to ensure de-identification.
* Variables were censored so that no individual value has fewer than ten
* observations. This results in the right tail of the distribution of these
* variables being grouped into one large upper bin." This variable contains
* missing values.
generate Y_num = num_visit_cens_ed

* Y_charges: total ER charges in the study period, taken from charg_tot_ed
* in the OHIE raw data. It was also truncated: "This variable was truncated
* at 2*99th percentile of the original distribution (conditional on being
* non-zero)." Besides charges incurred in the ED, charg_tot_ed includes the
* costs of any subsequent inpatient visits that resulted from the ED visit
* (we assumed this by checking). This variable contains missing values.
generate Y_charges = charg_tot_ed

* Instrument
* Z equals one for lottery winners. Whether an individual won or lost the
* lottery is recorded in the raw OHIE variable "treatment", which the OHIE
* documentation describes as "Selected in the lottery".
generate Z = (treatment == 1)

* Endogenous variable
* D is a binary indicator of having Medicaid at any time from the matched
* notification date until September 30, 2009. It is the endogenous variable
* behind the Taubman 2014 results and comes from the raw OHIE variable
* ohp_all_ever_firstn_30sep2009.
generate any_medicaid = ohp_all_ever_firstn_30sep2009
		
* Define Y_pre	
* All pre-period variables are measured from January 1, 2007 to March 9, 2008. 
* Each outcome variable has a corresponding pre-period measure. The pre-period 
* variables were censored/truncated in a similar manner as the study period 
* variables (see the OHIE documentation for more detail). The pre-period 
* variables for number of ER visits and ER total charges contain missing values. 
* The pre-period variable for any ER visit does not contain any missing values.	
gen Y_any_pre = any_visit_pre_ed
gen Y_num_pre = num_visit_pre_cens_ed
gen Y_charges_pre = charg_tot_pre_ed

* Indicators for each level of the pre-period visit count. The indicators
* are left missing whenever the count is missing. missing() is used instead
* of a comparison with "." because extended missing values (.a to .z) are
* not equal to "." and compare greater than every number, so they would
* otherwise be coded as 1 in the "more than 3 visits" indicator.
gen Y_num_pre0 = (Y_num_pre == 0) if !missing(Y_num_pre)
gen Y_num_pre1 = (Y_num_pre == 1) if !missing(Y_num_pre)
gen Y_num_pre2 = (Y_num_pre == 2) if !missing(Y_num_pre)
gen Y_num_pre3 = (Y_num_pre == 3) if !missing(Y_num_pre)
gen Y_num_pre1_3 = inlist(Y_num_pre, 1, 2, 3) if !missing(Y_num_pre)
gen Y_num_pre2_3 = inlist(Y_num_pre, 2, 3) if !missing(Y_num_pre)
gen Y_num_pre4 = (Y_num_pre > 3) if !missing(Y_num_pre)


* Define Y_pre_bi
* Similar to Y_pre, but =1 if number of pre-period visits is greater than or
* equal to a given level (binary variables for cumulative distribution, 
* rather than binary indicators of categorical variables). As above, the
* indicators are missing whenever the visit count is missing.
gen Y_pre_bi1 = (Y_num_pre >= 1) if !missing(Y_num_pre)
gen Y_pre_bi2 = (Y_num_pre >= 2) if !missing(Y_num_pre)
gen Y_pre_bi3 = (Y_num_pre >= 3) if !missing(Y_num_pre)
gen Y_pre_bi4 = (Y_num_pre >= 4) if !missing(Y_num_pre)


			
* Variable labels for the outcomes, their pre-period counterparts, the
* instrument, and the endogenous variable

* Study-period outcomes
label variable Y_any "Any ER visit in the study period"
label variable Y_num "Number of ER visits in the study period (Censored)"
label variable Y_charges "Total charges (ER and non-ER) in the study period (Censored)"

* Pre-period levels
label variable Y_any_pre "Any ER visits in the pre-period"
label variable Y_num_pre "Number of ER visits in the pre-period (Censored)"
label variable Y_charges_pre "Total charges (ER and non-ER) in the pre-period (Censored)"

* Pre-period indicators for an exact number (or range) of visits
label variable Y_num_pre0 "Indicates no pre-period ER visits"
label variable Y_num_pre1 "Indicates one pre-period ER visit"
label variable Y_num_pre2 "Indicates 2 pre-period ER visits"
label variable Y_num_pre3 "Indicates 3 pre-period ER visits"
label variable Y_num_pre4 "Indicates 4+ pre-period ER visits"
label variable Y_num_pre1_3 "Indicates 1-3 pre-period ER visits"
label variable Y_num_pre2_3 "Indicates 2-3 pre-period ER visits"

* Pre-period indicators for at least a given number of visits
label variable Y_pre_bi1 "Indicates one or more pre-period ER visits"
label variable Y_pre_bi2 "Indicates two or more pre-period ER visits"
label variable Y_pre_bi3 "Indicates three or more pre-period ER visits"
label variable Y_pre_bi4 "Indicates four or more pre-period ER visits"

* Instrument and endogenous variable
label variable Z "Selected in the lottery"
label variable any_medicaid "Any Medicaid insurance (from matched notification date until Sept. 30, 2009)"
		
* Covariates: each variable is created and then labelled right away

* Female
generate female = female_list
label variable female "Female"

* English-speaker: whether the individual requested English-language
* materials during the lottery signup
generate english = english_list
label variable english "English-speaker"

* Age: the raw OHIE data has no age variable, so age is defined as
* [2009 - year of birth], which makes it an integer
generate age = 2009 - birthyear_list
label variable age "Age in 2009"

* Self-reported health from in-person interview, coded as in Allen et al.
* (2010)
generate selfhealth_fair = health_gen_bin_0m
label variable selfhealth_fair "Self-reported health fair or poor"
generate selfhealth = health_gen_0m
label variable selfhealth "Self-reported health"

* Frequency weights
* The OHIE data are unweighted, so the weight is 1 for every individual.
* The variable exists only so that code written for the BRFSS data, which
* does have frequency weights, also runs on the OHIE data with this "dummy"
* weight. This matters most for the extrapolation.
generate byte w = 1
label variable w "Frequency weight"

* Number of lottery entrants: takes the values 1, 2, or 3 only
generate numhh = numhh_list
label variable numhh "Number of lottery entrants in household"

* Restrict the data to the analytic variables and put them in list order
keep `varlist'
order `varlist'
		
* Common covariates
* Female, age, and English are expanded into binary variables together with
* all of their two-way interactions. The resulting variables are called the
* common covariates and all carry the prefix _X. They are the only
* covariates for which a continuous variable (age) is broken into individual
* binary variables; every other covariate keeps its original form (binary,
* categorical, or continuous).

* Put the i. prefix on every common covariate, categorical ones first and
* continuous ones after, so that xi expands each into binary variables
foreach cov in `X_byte' `X_cont' {
	local Xs "`Xs' i.`cov'"
}

* Pair each term with every term that comes after it in the list, giving
* each two-way interaction between age, gender, and English exactly once
local n_terms : word count `Xs'
forvalues a = 1/`n_terms' {
	local left : word `a' of `Xs'
	forvalues b = `=`a'+1'/`n_terms' {
		local right : word `b' of `Xs'
		local _X "`_X' `left'*`right'"
	}
}

* Let xi build the binary variables and their interactions
xi `_X'

* Swap the _I prefix that xi assigns for _X, so that the common covariates
* can be listed with the _X prefix followed by a wildcard
renvars _I*, presub(_I _X)

* Interact every common covariate with Z. The products are first created
* with the prefix Z_X and then renamed so that they carry the prefix _Z,
* which allows the interactions between the common controls and Z to be
* listed with the _Z prefix followed by a wildcard.
foreach cov of varlist _X* {
	generate Z`cov' = `cov'*`Z'
}
renvars Z_*, presub(Z _Z)


* Z interacted with the pre-period ER utilization variables

* Levels: any visit, number of visits, total charges
foreach stub in any num charges {
	generate _pre_Z_Y_`stub' = Y_`stub'_pre*Z
}

* Indicators for the number of pre-period visits
foreach lvl in 0 1 2 3 4 1_3 2_3 {
	generate _pre_Z_Y_num`lvl' = Y_num_pre`lvl'*Z
}

* Z interacted with first_day_list, a binary variable indicating whether the
* individual signed up for the lottery on the first day possible
generate _add_Z_first_day_list = first_day_list*Z

* Z interacted with snap_ever_prenotify07, a binary variable indicating
* whether the individual was ever personally on SNAP between January 1, 2007
* and the lottery notification date (the pre-period)
generate _add_Z_snap_ever = snap_ever_prenotify07*Z

* Z interacted with tanf_ever_prenotify07, a binary variable indicating
* whether the individual was ever personally on TANF between January 1, 2007
* and the lottery notification date (the pre-period)
generate _add_Z_tanf_ever = tanf_ever_prenotify07*Z

* Generate flags for nonmissing pre-period utilization
* Each flag is 1 when the pre-period measure is observed and 0 otherwise.
* missing() is used rather than a comparison with "." so that extended
* missing values (.a to .z), which are not equal to ".", are also flagged
* as missing.
gen Y_any_pre_nonmiss_flag = !missing(Y_any_pre)
gen Y_num_pre_nonmiss_flag = !missing(Y_num_pre)
gen Y_charges_pre_nonmiss_flag = !missing(Y_charges_pre)

label var Y_any_pre_nonmiss_flag "(binary) Non-missing any ER visits, pre-period"
label var Y_num_pre_nonmiss_flag "(binary) Non-missing number of ER visits, pre-period"
label var Y_charges_pre_nonmiss_flag "(binary) Non-missing ER total charges, pre-period"

* Generate new variables for pre-period utilization that replace missing values
* with zeroes. The flags above record which observations were filled in.
	* Any ER visits, pre-period
	gen Y_any_pre_nonmiss = Y_any_pre
	replace Y_any_pre_nonmiss = 0 if Y_any_pre_nonmiss_flag==0
	label var Y_any_pre_nonmiss "Non-missing any ER visits, pre-period"
	
	* Number of ER visits, pre-period
	gen Y_num_pre_nonmiss = Y_num_pre
	replace Y_num_pre_nonmiss = 0 if Y_num_pre_nonmiss_flag==0
	label var Y_num_pre_nonmiss "Non-missing number of ER visits, pre-period"
	
	* ER total charges, pre-period
	gen Y_charges_pre_nonmiss = Y_charges_pre
	replace Y_charges_pre_nonmiss = 0 if Y_charges_pre_nonmiss_flag==0
	label var Y_charges_pre_nonmiss "Non-missing ER total charges, pre-period"
	
* Interact the non-missing pre-period utilization dummies and variables with Z
gen _pre_nonmiss_Z_Y_any_flag = Z*Y_any_pre_nonmiss_flag
gen _pre_nonmiss_Z_Y_num_flag = Z*Y_num_pre_nonmiss_flag
gen _pre_nonmiss_Z_Y_charges_flag = Z*Y_charges_pre_nonmiss_flag
gen _pre_nonmiss_Z_Y_any = Z*Y_any_pre_nonmiss
gen _pre_nonmiss_Z_Y_num = Z*Y_num_pre_nonmiss
gen _pre_nonmiss_Z_Y_charges =Z*Y_charges_pre_nonmiss
		
* Finalize and save the analytic data set for the full sample

* Shrink storage types where possible and store person_id as a string
compress
tostring person_id, replace			

* The same file is written to both output folders
save "$final/oregonpooled", replace
save "$final_analytic/oregonpooled", replace
		
* Finalize and save the analytic data set for the 1 lottery entrant sample

* Set the full sample aside so that it can be brought back after the
* subsample has been saved
preserve

* Drop individuals that are not from a 1 lottery entrant household
drop if numhh != 1		

save "$final/oregonnumhh1", replace
save "$final_analytic/oregonnumhh1", replace	

* Bring the full sample back explicitly instead of relying on the automatic
* restore that happens when the do-file ends
restore

* Stop the timer and report the elapsed time, converted from seconds to hours
timer off 1
timer list 1
local hours = `r(t1)'/3600
di "Computing time is `hours' hours"
	
qui log close
